In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [12]:
# Loading in hourly rain data from CSV, parsing the timestamp, and adding it as an index so it's more useful
rain_df = pd.read_csv('data/ohare_full_precip_hourly.csv')
rain_df['DATE'] = pd.to_datetime(rain_df['DATE'])
rain_df = rain_df.set_index(pd.DatetimeIndex(rain_df['DATE']))
print(rain_df.dtypes)
rain_df.head()
Out[12]:
In [13]:
chi_rain_sub = rain_df[:'1955-01-01']
chi_rain_sub['HOURLYPrecip'].max()
Out[13]:
In [14]:
chi_rain_sub = rain_df[rain_df['HOURLYPrecip'] > 0.0]
chi_rain_sub['HOURLYPrecip'].head()
Out[14]:
In [17]:
rain_df = rain_df['1970-09-01':]
rain_df.head()
Out[17]:
In [18]:
# Resampling the dataframe into one hour increments, accessing max because accumulation listed more often than hourly (i.e.
# every 15 minutes) is the total precipitation since the hour began
# Description: http://www1.ncdc.noaa.gov/pub/data/cdo/documentation/LCD_documentation.pdf
chi_rain_series = rain_df['HOURLYPrecip'].resample('1H').max()
print(chi_rain_series.count())
chi_rain_series.head()
Out[18]:
Using rolling time series in pandas to find n-year events. First looking at some for 6 hour interval.
The rolling sum here calculates the sum of observations over a given number of observations over time. Since each observation here is an hour, the window we provide is a number of hours. Each row is then the sum of observations over that number of hours.
If we had the following rows:
And we calculate the rolling sum with a window of 2 hours, the results will be:
Details of the specific cutoffs for each level of n-year storm can be found here: Rainfall Frequency Information Illinois
In [19]:
roll_6_hr = chi_rain_series.rolling(window=6)
roll_6_hr.sum().plot()
Out[19]:
Because it's looking for a count of intervals, the initial counts of events returned could include more than one event per storm. For example, if one storm lasted 8 hours from 1pm to 9pm and rained relatively consistently throughout at a 5-year event level and we're looking for 6 hour intervals, it could count for as many as 3 events.
In [21]:
roll_6 = pd.DataFrame(roll_6_hr.sum())
print('For 6-hour intervals')
print('{} 1-year events for Northeast Illinois'.format(len(roll_6[(roll_6['HOURLYPrecip'] >= 1.88) &
(roll_6['HOURLYPrecip'] < 2.28)])))
print('{} 2-year events for Northeast Illinois'.format(len(roll_6[(roll_6['HOURLYPrecip'] >= 2.28) &
(roll_6['HOURLYPrecip'] < 2.85)])))
print('{} 5-year events for Northeast Illinois'.format(len(roll_6[(roll_6['HOURLYPrecip'] >= 2.85) &
(roll_6['HOURLYPrecip'] < 3.35)])))
print('{} 10-year events for Northeast Illinois'.format(len(roll_6[(roll_6['HOURLYPrecip'] >= 3.35) &
(roll_6['HOURLYPrecip'] < 4.13)])))
print('{} 25-year events for Northeast Illinois'.format(len(roll_6[(roll_6['HOURLYPrecip'] >= 4.13) &
(roll_6['HOURLYPrecip'] < 4.90)])))
print('{} 50-year events for Northeast Illinois'.format(len(roll_6[(roll_6['HOURLYPrecip'] >= 4.90) &
(roll_6['HOURLYPrecip'] < 5.69)])))
print('{} 100-year events for Northeast Illinois'.format(len(roll_6[roll_6['HOURLYPrecip'] >= 5.69])))
In [25]:
roll_6_1yr = roll_6[(roll_6['HOURLYPrecip'] >= 1.88) & (roll_6['HOURLYPrecip'] < 2.28)]
print('{} days with 1-year events in Northeast Illinois'.format(len(roll_6_1yr.groupby(roll_6_1yr.index.date))))
roll_6_1yr.sort_values(by=['HOURLYPrecip'], ascending=False, inplace=True)
roll_6_1yr.head()
Out[25]:
In [26]:
# Many of these are from the same days, but over slightly different intervals as mentioned before
roll_6_2yr = roll_6[(roll_6['HOURLYPrecip'] >= 2.28) & (roll_6['HOURLYPrecip'] < 2.85)]
print('{} days with 2-year events in Northeast Illinois'.format(len(roll_6_2yr.groupby(roll_6_2yr.index.date))))
roll_6_2yr.sort_values(by=['HOURLYPrecip'], ascending=False, inplace=True)
roll_6_2yr.head()
Out[26]:
In [27]:
# Helper function taking the series, window, and list of cutoffs to make this quicker, returns the subset
def rolling_results(rain_series, window, rain_cutoffs):
window_df = pd.DataFrame(rain_series.rolling(window=window).sum())
print('For {}-hour intervals'.format(window))
print('{} 1-year events for Northeast Illinois'.format(len(window_df[(window_df['HOURLYPrecip'] >= rain_cutoffs[0]) &
(window_df['HOURLYPrecip'] < rain_cutoffs[1])])))
print('{} 2-year events for Northeast Illinois'.format(len(window_df[(window_df['HOURLYPrecip'] >= rain_cutoffs[1]) &
(window_df['HOURLYPrecip'] < rain_cutoffs[2])])))
print('{} 5-year events for Northeast Illinois'.format(len(window_df[(window_df['HOURLYPrecip'] >= rain_cutoffs[2]) &
(window_df['HOURLYPrecip'] < rain_cutoffs[3])])))
print('{} 10-year events for Northeast Illinois'.format(len(window_df[(window_df['HOURLYPrecip'] >= rain_cutoffs[3]) &
(window_df['HOURLYPrecip'] < rain_cutoffs[4])])))
print('{} 25-year events for Northeast Illinois'.format(len(window_df[(window_df['HOURLYPrecip'] >= rain_cutoffs[4]) &
(window_df['HOURLYPrecip'] < rain_cutoffs[5])])))
print('{} 50-year events for Northeast Illinois'.format(len(window_df[(window_df['HOURLYPrecip'] >= rain_cutoffs[5]) &
(window_df['HOURLYPrecip'] < rain_cutoffs[6])])))
print('{} 100-year events for Northeast Illinois'.format(len(window_df[window_df['HOURLYPrecip'] >= rain_cutoffs[6]])))
# Gets the subset of the dataframe for the given cutoff index (i.e. 5 year is the third, so cutoff_index would be 3)
def rolling_subset(rain_series, window, rain_cutoffs, cutoff_index):
window_df = pd.DataFrame(rain_series.rolling(window=window).sum())
if cutoff_index <= 6:
return window_df[(window_df['HOURLYPrecip'] >= rain_cutoffs[cutoff_index - 1]) & (roll_6['HOURLYPrecip'] < 2.85)]
if cutoff_index == 7:
return window_df[window_df['HOURLYPrecip'] >= rain_cutoffs[cutoff_index -1]]
In [28]:
cutoffs_12hr = [2.18, 2.64, 3.31, 3.89, 4.79, 5.6, 6.59]
rolling_results(chi_rain_series, 12, cutoffs_12hr)
In [29]:
roll_12_2yr = rolling_subset(chi_rain_series, 12, cutoffs_12hr, 2)
print('{} days with 2-year events for 12 hrs in Northeast Illinois'.format(len(roll_12_2yr.groupby(roll_12_2yr.index.date))))
In [30]:
cutoffs_24hr = [2.51, 3.04, 3.80, 4.47, 5.51, 6.46, 7.58]
rolling_results(chi_rain_series, 24, cutoffs_24hr)
In [31]:
roll_24_1yr = rolling_subset(chi_rain_series, 24, cutoffs_24hr, 1)
print('{} days with 1-year events for 24 hrs in Northeast Illinois'.format(len(roll_24_1yr.groupby(roll_24_1yr.index.date))))
In [32]:
cutoffs_48hr = [2.70, 3.30, 4.09, 4.81, 5.88, 6.84, 8.16]
rolling_results(chi_rain_series, 48, cutoffs_48hr)
In [33]:
roll_48_1yr = rolling_subset(chi_rain_series, 48, cutoffs_48hr, 1)
print('{} days with 1-year events for 48 hrs in Northeast Illinois'.format(len(roll_48_1yr.groupby(roll_48_1yr.index.date))))
Note: Because the rolling window pulls from the same overlapping period multiple times, it makes sense that longer time periods have higher amounts of events at the lower end of the spectrum. They're pulling from the same incidents more times than the shorter intervals can
In [ ]: